Let’s first take a look at the variables
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
###Visualization with ggplot
# Read the exam-score CSV (path is relative to the working directory).
dataset <- read_csv(file = "Exam_Score_Prediction.csv")
## Rows: 20000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): gender, course, internet_access, sleep_quality, study_method, facil...
## dbl (6): student_id, age, study_hours, class_attendance, sleep_hours, exam_s...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact overview: one line per column showing its type and first values.
dataset |> glimpse()
## Rows: 20,000
## Columns: 13
## $ student_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ age <dbl> 17, 23, 22, 20, 20, 23, 17, 22, 18, 17, 21, 24, 22, 2…
## $ gender <chr> "male", "other", "male", "other", "female", "male", "…
## $ course <chr> "diploma", "bca", "b.sc", "diploma", "diploma", "b.te…
## $ study_hours <dbl> 2.78, 3.37, 7.88, 0.67, 0.89, 3.48, 1.35, 5.48, 2.89,…
## $ class_attendance <dbl> 92.9, 64.8, 76.8, 48.4, 71.6, 65.4, 69.0, 51.1, 92.0,…
## $ internet_access <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes…
## $ sleep_hours <dbl> 7.4, 4.6, 8.5, 5.8, 9.8, 4.2, 7.4, 8.2, 6.6, 9.8, 5.8…
## $ sleep_quality <chr> "poor", "average", "poor", "average", "poor", "good",…
## $ study_method <chr> "coaching", "online videos", "coaching", "online vide…
## $ facility_rating <chr> "low", "medium", "high", "low", "low", "low", "high",…
## $ exam_difficulty <chr> "hard", "moderate", "moderate", "moderate", "moderate…
## $ exam_score <dbl> 58.9, 54.8, 90.3, 29.7, 43.7, 58.2, 53.7, 47.3, 44.9,…
# Number of rows followed by number of columns.
c(nrow(dataset), ncol(dataset))
## [1] 20000 13
# Internal structure: the storage type of every column plus the readr spec.
dataset |> str()
## spc_tbl_ [20,000 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ student_id : num [1:20000] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:20000] 17 23 22 20 20 23 17 22 18 17 ...
## $ gender : chr [1:20000] "male" "other" "male" "other" ...
## $ course : chr [1:20000] "diploma" "bca" "b.sc" "diploma" ...
## $ study_hours : num [1:20000] 2.78 3.37 7.88 0.67 0.89 3.48 1.35 5.48 2.89 6.77 ...
## $ class_attendance: num [1:20000] 92.9 64.8 76.8 48.4 71.6 65.4 69 51.1 92 44.8 ...
## $ internet_access : chr [1:20000] "yes" "yes" "yes" "yes" ...
## $ sleep_hours : num [1:20000] 7.4 4.6 8.5 5.8 9.8 4.2 7.4 8.2 6.6 9.8 ...
## $ sleep_quality : chr [1:20000] "poor" "average" "poor" "average" ...
## $ study_method : chr [1:20000] "coaching" "online videos" "coaching" "online videos" ...
## $ facility_rating : chr [1:20000] "low" "medium" "high" "low" ...
## $ exam_difficulty : chr [1:20000] "hard" "moderate" "moderate" "moderate" ...
## $ exam_score : num [1:20000] 58.9 54.8 90.3 29.7 43.7 58.2 53.7 47.3 44.9 77.7 ...
## - attr(*, "spec")=
## .. cols(
## .. student_id = col_double(),
## .. age = col_double(),
## .. gender = col_character(),
## .. course = col_character(),
## .. study_hours = col_double(),
## .. class_attendance = col_double(),
## .. internet_access = col_character(),
## .. sleep_hours = col_double(),
## .. sleep_quality = col_character(),
## .. study_method = col_character(),
## .. facility_rating = col_character(),
## .. exam_difficulty = col_character(),
## .. exam_score = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary: min, quartiles, median, mean and max for numeric columns.
summary(object = dataset)
## student_id age gender course
## Min. : 1 Min. :17.00 Length:20000 Length:20000
## 1st Qu.: 5001 1st Qu.:18.00 Class :character Class :character
## Median :10000 Median :20.00 Mode :character Mode :character
## Mean :10001 Mean :20.47
## 3rd Qu.:15000 3rd Qu.:22.00
## Max. :20001 Max. :24.00
## study_hours class_attendance internet_access sleep_hours
## Min. :0.080 Min. :40.60 Length:20000 Min. :4.100
## 1st Qu.:2.000 1st Qu.:55.10 Class :character 1st Qu.:5.500
## Median :4.040 Median :69.90 Mode :character Median :7.000
## Mean :4.008 Mean :70.02 Mean :7.009
## 3rd Qu.:6.000 3rd Qu.:85.00 3rd Qu.:8.500
## Max. :7.910 Max. :99.40 Max. :9.900
## sleep_quality study_method facility_rating exam_difficulty
## Length:20000 Length:20000 Length:20000 Length:20000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## exam_score
## Min. : 19.60
## 1st Qu.: 48.80
## Median : 62.60
## Mean : 62.51
## 3rd Qu.: 76.30
## Max. :100.00
# Column names and the first six rows.
head(dataset, n = 6L)
## # A tibble: 6 × 13
## student_id age gender course study_hours class_attendance internet_access
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 1 17 male diploma 2.78 92.9 yes
## 2 2 23 other bca 3.37 64.8 yes
## 3 3 22 male b.sc 7.88 76.8 yes
## 4 4 20 other diploma 0.67 48.4 yes
## 5 5 20 female diploma 0.89 71.6 yes
## 6 6 23 male b.tech 3.48 65.4 yes
## # ℹ 6 more variables: sleep_hours <dbl>, sleep_quality <chr>,
## # study_method <chr>, facility_rating <chr>, exam_difficulty <chr>,
## # exam_score <dbl>
# Start exploring the data: which of the variables are numeric?
names(select(dataset, where(is.numeric)))
## [1] "student_id" "age" "study_hours" "class_attendance"
## [5] "sleep_hours" "exam_score"
library(ggplot2)
# Scatter plot of exam score against study hours with a fitted linear trend
# line (confidence band switched off).
dataset |>
  ggplot(mapping = aes(x = study_hours, y = exam_score)) +
  geom_point(size = 1, color = "gray30", alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs study hours",
    x = "Study hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Same relationship, with points and linear trend lines coloured by gender.
ggplot(
  data = dataset,
  mapping = aes(x = study_hours, y = exam_score, color = gender)
) +
  geom_point(size = 1, alpha = 0.6) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam scores vs Study hours",
    x = "Study hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of exam score against sleep hours with a linear trend line
# (no confidence band).
dataset |>
  ggplot(mapping = aes(x = sleep_hours, y = exam_score)) +
  geom_point(size = 1, color = "gray30", alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs sleep hours",
    x = "Sleep hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of exam score against class attendance; the linear trend line
# is drawn together with its confidence band (se = TRUE).
ggplot(
  data = dataset,
  mapping = aes(x = class_attendance, y = exam_score)
) +
  geom_point(size = 1, color = "gray30", alpha = 0.3) +
  geom_smooth(se = TRUE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs class attendance",
    x = "Class Attendance",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'
# The next charts colour and group by categorical variables, so convert the
# relevant text columns to factors in a single pass.
# Base R alternative for one column: dataset$gender <- factor(dataset$gender)
dataset <- dataset |>
  mutate(
    across(
      c(gender, course, study_method, sleep_quality),
      factor
    )
  )
# Plot exam score against study hours with one colour (and one linear trend
# line) per gender level.
dataset |>
  ggplot(mapping = aes(x = study_hours, y = exam_score, color = gender)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs study hours",
    x = "Study hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Exam score against class attendance, coloured by gender, with a linear
# trend line and confidence band per gender.
ggplot(
  data = dataset,
  mapping = aes(x = class_attendance, y = exam_score, color = gender)
) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = TRUE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs class attendance",
    x = "Class attendance",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Exam score against study hours by gender, this time including the
# confidence band around each linear trend line.
dataset |>
  ggplot(mapping = aes(x = study_hours, y = exam_score, color = gender)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = TRUE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs study hours by gender",
    x = "Study hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Look at the values inside the categorical variables, starting with the
# number of students per study method.
table(dataset[["study_method"]])
##
## coaching group study mixed online videos self-study
## 4036 3922 3894 4069 4079
# Number of students per sleep-quality level.
table(dataset[["sleep_quality"]])
##
## average good poor
## 6694 6619 6687
# Number of students per course.
table(dataset[["course"]])
##
## b.com b.sc b.tech ba bba bca diploma
## 2864 2878 2798 2896 2836 2902 2826
# Number of students with and without internet access.
table(dataset[["internet_access"]])
##
## no yes
## 3012 16988
# Number of students per facility rating.
table(dataset[["facility_rating"]])
##
## high low medium
## 6602 6638 6760
# Number of students per exam-difficulty level.
table(dataset[["exam_difficulty"]])
##
## easy hard moderate
## 6141 3981 9878
# Visualize the categorical variables: bar chart counting the students in
# each study method.
dataset |>
  ggplot(mapping = aes(x = study_method)) +
  geom_bar(fill = "blue") +
  theme_minimal() +
  labs(
    title = "Distribution of study methods",
    x = "Study method",
    y = "Number of students"
  )
# Bar chart counting the students in each sleep-quality level, using the
# factor's current level order.
ggplot(data = dataset, mapping = aes(x = sleep_quality)) +
  geom_bar(fill = "blue") +
  theme_minimal() +
  labs(
    title = "Sleep quality frequency",
    x = "Sleep quality",
    y = "number of students"
  )
# Put the sleep-quality levels in their natural order (poor < average < good)
# so the bars are drawn in that sequence.
dataset[["sleep_quality"]] <- factor(
  dataset[["sleep_quality"]],
  levels = c("poor", "average", "good")
)

# Redraw the sleep-quality bar chart with the reordered levels.
dataset |>
  ggplot(mapping = aes(x = sleep_quality)) +
  geom_bar(fill = "blue") +
  theme_minimal() +
  labs(
    title = "Distribution of Students",
    x = "Sleep quality",
    y = "Number of students"
  )
# Boxplot comparing the exam-score distribution across study methods.
dataset |>
  ggplot(mapping = aes(x = study_method, y = exam_score)) +
  geom_boxplot(color = "grey20", fill = "grey80") +
  theme_minimal() +
  labs(
    title = "Exam score by study method",
    x = "Study method",
    y = "Exam score"
  )
# Order exam difficulty from easiest to hardest so the boxes appear in a
# meaningful sequence rather than in the default alphabetical level order.
dataset$exam_difficulty <- factor(
  dataset$exam_difficulty,
  levels = c("easy", "moderate", "hard")
)

# Boxplot of the exam-score distribution per difficulty level.
# The y aesthetic is exam_score, so the axis and title describe scores
# (the previous labels described a student count, which this plot does not show).
ggplot(dataset, aes(x = exam_difficulty, y = exam_score)) +
  geom_boxplot(fill = "grey80", color = "grey20") +
  labs(
    x = "Exam difficulty",
    y = "Exam score",
    title = "Exam score by exam difficulty"
  ) +
  theme_minimal()
# Put the facility-rating levels in their natural order (low < medium < high).
dataset[["facility_rating"]] <- factor(
  dataset[["facility_rating"]],
  levels = c("low", "medium", "high")
)

# Boxplot comparing the exam-score distribution across facility ratings.
dataset |>
  ggplot(mapping = aes(x = facility_rating, y = exam_score)) +
  geom_boxplot(color = "grey20", fill = "grey80") +
  theme_minimal() +
  labs(
    title = "Distribution of Exam score per facility rating",
    x = "Facility rating",
    y = "Exam score"
  )
# Store internet access as a factor.
dataset[["internet_access"]] <- factor(dataset[["internet_access"]])

# Boxplot of the exam-score distribution per exam-difficulty level.
# NOTE(review): internet_access is converted above but is not used by this
# plot — confirm whether a plot by internet access was intended here.
ggplot(
  data = dataset,
  mapping = aes(x = exam_difficulty, y = exam_score)
) +
  geom_boxplot(color = "grey20", fill = "grey90") +
  theme_minimal() +
  labs(
    title = "Exam score vs exam difficulty",
    x = "Exam difficulty",
    y = "Exam score"
  )